1 Performing RNA-seq analysis using the DESeq2 package

For this walk-through we will be using the same example (and much of the same code!) as in this chapter - https://uclouvain-cbio.github.io/WSBIM2122/sec-rnaseq.html

For a basic intro to R and dplyr, please watch this series of videos made by Duke’s Center for Computational Thinking - https://warpwire.duke.edu/w/f0YGAA/

To learn more about the DESeq2 package, please refer to this wonderful guide - http://master.bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html#exploratory-analysis-and-visualization

2 Load and Inspect Data

# system.file() finds the location of the specified file in the specified package
# load() then opens the specified file (`.rda` objects in this case) so that they appear in our global environment 
load(system.file("extdata/deseq2/counts.rda",
                  package = "rWSBIM2122"))
load(system.file("extdata/deseq2/coldata.rda",
                  package = "rWSBIM2122"))
# you should now see two new objects in your Environment pane (top right-hand corner)

# `coldata` is a small dataframe, so just type its name in the console and press enter to see its contents
coldata
# `counts` is a much larger object. How many rows and columns does it have?
dim(counts)
## [1] 58735     6
# Take a look at the first few rows of `counts`
head(counts)
# Looking at these two dataframes, what kind of analysis/comparison would you perform?

3 Construct a DESeqDataSet object

dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = coldata,
                              design = ~ Condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# How do we learn more about this new function, DESeqDataSetFromMatrix() from the DESeq2 package
?DESeq2::DESeqDataSetFromMatrix
## starting httpd help server ...
##  done
# take a look at this new `dds` object
dds
## class: DESeqDataSet 
## dim: 58735 6 
## metadata(1): version
## assays(1): counts
## rownames(58735): ENSG00000223972 ENSG00000227232 ... ENSG00000277475
##   ENSG00000268674
## rowData names(0):
## colnames(6): sample1 sample2 ... sample5 sample6
## colData names(3): Cell Type Condition
# `dds` is a new type formally known as an "S4 object"
# S4 objects have slots that can be accessed by using the `@` symbol
# to see our two original objects `coldata` and `counts`, we could run the following
dds@colData
## DataFrame with 6 rows and 3 columns
##                Cell        Type Condition
##         <character> <character>  <factor>
## sample1       Cell1  Epithelial      mock
## sample2       Cell1  Epithelial      mock
## sample3       Cell1  Epithelial      mock
## sample4       Cell1  Epithelial      KD  
## sample5       Cell1  Epithelial      KD  
## sample6       Cell1  Epithelial      KD
head(dds@assays@data[[1]])
##                 sample1 sample2 sample3 sample4 sample5 sample6
## ENSG00000223972       0       0       0       0       0       1
## ENSG00000227232      14      28      17      40      16      13
## ENSG00000278267       8       4       3       1       1       6
## ENSG00000243485       0       0       0       0       0       0
## ENSG00000284332       0       0       0       0       0       0
## ENSG00000237613       0       0       0       0       0       0
# There are functions that allow us to access elements of `dds` in a more intuitive way
# use the `counts()` function to access the original `counts` dataframe
head(counts(dds))
##                 sample1 sample2 sample3 sample4 sample5 sample6
## ENSG00000223972       0       0       0       0       0       1
## ENSG00000227232      14      28      17      40      16      13
## ENSG00000278267       8       4       3       1       1       6
## ENSG00000243485       0       0       0       0       0       0
## ENSG00000284332       0       0       0       0       0       0
## ENSG00000237613       0       0       0       0       0       0
# we can also use the `assay()` function for this

# use the `colData()` function to access the original (almost!) `coldata` dataframe
colData(dds)
## DataFrame with 6 rows and 3 columns
##                Cell        Type Condition
##         <character> <character>  <factor>
## sample1       Cell1  Epithelial      mock
## sample2       Cell1  Epithelial      mock
## sample3       Cell1  Epithelial      mock
## sample4       Cell1  Epithelial      KD  
## sample5       Cell1  Epithelial      KD  
## sample6       Cell1  Epithelial      KD

4 Run DESeq2

4.1 Run DESeq and inspect results

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# this function does a lot!
# let's ignore all of this for now and take a look at the results :)
head(results(dds))
## log2 fold change (MLE): Condition mock vs KD 
## Wald test p-value: Condition mock vs KD 
## DataFrame with 6 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000223972  0.164582      -0.964328  4.080473 -0.236327  0.813179
## ENSG00000227232 21.005516      -0.295976  0.539816 -0.548290  0.583493
## ENSG00000278267  4.127792       1.035234  1.142151  0.906390  0.364729
## ENSG00000243485  0.000000             NA        NA        NA        NA
## ENSG00000284332  0.000000             NA        NA        NA        NA
## ENSG00000237613  0.000000             NA        NA        NA        NA
##                      padj
##                 <numeric>
## ENSG00000223972        NA
## ENSG00000227232  0.740534
## ENSG00000278267  0.555793
## ENSG00000243485        NA
## ENSG00000284332        NA
## ENSG00000237613        NA

4.2 Re-order levels and re-run DEseq

# Notice that the log2FoldChange has the "KD" group as the base comparison
# why is this?
dds$Condition
## [1] mock mock mock KD   KD   KD  
## Levels: KD mock
class(dds$Condition)
## [1] "factor"
# `Condition` is a factor, meaning that it is a categorical variable, which is a variable that contains a fixed number of categories or groups that a given observation can belong to
# The number of levels of a given factor refer to the number of categories this variable contains
# The order of these levels determines how comparisons are made in statistical modeling
# In the case of `Condition`, "KD" appears first and so is the base comparison group
# To switch this order, we can run this - 
coldata$Condition <- factor(coldata$Condition, levels = c("mock", "KD"))

# check levels
coldata$Condition
## [1] mock mock mock KD   KD   KD  
## Levels: mock KD
# levels have been switched!

# create dds object again
dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = coldata,
                              design = ~ Condition) 
## converting counts to integer mode
# run DESeq again
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)
head(res)
## log2 fold change (MLE): Condition KD vs mock 
## Wald test p-value: Condition KD vs mock 
## DataFrame with 6 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000223972  0.164582       0.964295  4.080473  0.236319  0.813185
## ENSG00000227232 21.005516       0.295977  0.539816  0.548293  0.583491
## ENSG00000278267  4.127792      -1.035232  1.142151 -0.906388  0.364730
## ENSG00000243485  0.000000             NA        NA        NA        NA
## ENSG00000284332  0.000000             NA        NA        NA        NA
## ENSG00000237613  0.000000             NA        NA        NA        NA
##                      padj
##                 <numeric>
## ENSG00000223972        NA
## ENSG00000227232  0.740531
## ENSG00000278267  0.555790
## ENSG00000243485        NA
## ENSG00000284332        NA
## ENSG00000237613        NA

4.3 Visualize with MA-plot

# visualize results using plotMA()
plotMA(res)

4.4 Shrink log fold-changes for low-count genes

res_shrunk <- lfcShrink(dds, 
                        coef = "Condition_KD_vs_mock",
                        type = "apeglm") 
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
# Here's a nice article on empirical Bayes estimation - http://varianceexplained.org/r/empirical_bayes_baseball/

plotMA(res_shrunk)

# compare this plot with the previous one. How are they different?

4.5 Convert to dataframe and clean results table

# convert to dataframe
res_df <- as_tibble(res_shrunk, rownames = "ENSEMBL") # if we forget `rownames = `, we lose all the gene names!

# How would you remove all the rows that have `NAs` for the `padj` column
# We have a couple of options
# clue -
head(is.na(res_df$padj)) # gives us a bunch of TRUEs and FALSEs. We can use this to filter out the unwanted rows
## [1]  TRUE FALSE FALSE  TRUE  TRUE  TRUE
res_df_nona <- res_df %>% 
                filter(!is.na(padj))

# or we could use the `drop_na()` function
res_df_nona <- res_df %>% 
                  drop_na(padj)

# how to verify that there are no missing values for `padj`?
sum(is.na(res_df_nona$padj)) # this should be 0!
## [1] 0
res_df <- res_df_nona

4.6 Make a Volcano Plot

To learn how to use ggplot2, refer to this video - https://www.youtube.com/watch?v=WUwSVKasU9g

res_df %>%
  ggplot(aes(x = log2FoldChange, y = -log10(padj),
             color = padj < 0.05 & abs(log2FoldChange) > 1)) +
  geom_point(size = 0.5) +
  geom_hline(yintercept = -log10(0.05)) +
  geom_vline(xintercept = 1) +
  geom_vline(xintercept = -1) +
  theme(legend.position = "none")

5 Over-representation Analysis (ORA)

For a more detailed explanation of enrichment analyses (including ORA and gene set enrichment analysis), please watch this video - https://youtu.be/ZgZKmAYm-LE

5.1 Retrieve Entrez IDs and Gene Symbols

# Get ENTREZ IDs
# We'll need this for using the "org.Hs.eg.db" package
# Step One: Connect to the selected BioMart database and dataset hosted by Ensembl

ensembl <- useEnsembl(biomart = "genes", 
                   dataset = "hsapiens_gene_ensembl")

# Step Two: Retrieve gene names
# build a biomaRt query
# The getBM() function is the main query function in biomaRt
ensembl_to_entrez <- getBM(attributes = c("ensembl_gene_id", "external_gene_name",
                                            "entrezgene_id"),
                             values = res_df$ENSEMBL,
                             mart = ensembl)

# Plan B in case there's a connection problem -
# ensembl_to_entrez <- read_csv("https://raw.githubusercontent.com/dukecct/CBRG/main/inst/data/ensembl_to_entrez.csv")

# add this new info to res_df_nona
res_df <- res_df %>% 
            left_join(ensembl_to_entrez, by = c("ENSEMBL" = "ensembl_gene_id"))
## Warning in left_join(., ensembl_to_entrez, by = c(ENSEMBL = "ensembl_gene_id")): Each row in `x` is expected to match at most 1 row in `y`.
## ℹ Row 9 of `x` matches multiple rows.
## ℹ If multiple matches are expected, set `multiple = "all"` to silence this
##   warning.

5.2 Clean res_df

# remove rows with NAs in the columns in `entrezgene_id` and `padj`
res_df <- res_df %>% 
            drop_na(entrezgene_id, padj)

# are all ENTREZ IDs unique?
length(unique(res_df$entrezgene_id)) # 13711
## [1] 13886
nrow(res_df) # 13730
## [1] 13910
# drop duplicates
res_df <- res_df %>% 
            arrange(padj) %>% 
            distinct(entrezgene_id, .keep_all = TRUE)

# are all ENTREZ IDs unique?
length(unique(res_df$entrezgene_id)) == nrow(res_df) # TRUE
## [1] TRUE

5.3 Perform ORA on desired subset

# perform ORA
# we need a vector of ENTREZ IDs for genes with padj values < 0.05
sig_genes <- res_df %>% 
              filter(padj < 0.05, log2FoldChange > 1) %>% 
              pull(entrezgene_id)
head(sig_genes)
## [1] 79413 55270  1474  7533 54974  2013
go_ora <- enrichGO(gene = as.character(sig_genes),
                   OrgDb = org.Hs.eg.db,
                   universe = as.character(res_df$entrezgene_id),
                   ont = "MF",
                   readable = TRUE) # maps gene IDs to gene names
head(go_ora)
# Visualization
# dot plot
go_ora %>% 
  dotplot(showCategory = 30) + # showCategory   = number of enriched terms to display
  ggtitle("dotplot for ORA")

# or make your own plot!
ora_plot_interactive <- go_ora@result %>%
  dplyr::slice(1L:20L) %>% # show the first 20 enriched terms
  dplyr::mutate(GeneRatio = sapply(GeneRatio, function(x) eval(parse(text = x)))) %>% # compute decimal ratios) %>%
  # NOTE: Here we have switch from the pipe (%>%) to the "+" sign because ggplot2 only uses "+" (so far...)
  ggplot(aes(x = reorder(ID, GeneRatio), 
             y = GeneRatio, 
             fill = p.adjust,
             label = Description)) +
  geom_col() +
  coord_flip() +
  labs(x = "Enriched Term",
       y = "Gene Ratio") +
  theme_minimal()

plotly::ggplotly(ora_plot_interactive) # plotly CRAN package allows us to create interactive ggplot2 plots very easy!